From ee9d4d3801580368248e831a82afc4e1ed4f1300 Mon Sep 17 00:00:00 2001 From: "kaf24@scramble.cl.cam.ac.uk" Date: Thu, 3 Feb 2005 14:45:50 +0000 Subject: [PATCH] bitkeeper revision 1.1159.212.78 (4202391ehUS0T4TJglUpPqBH3oGjNQ) Move domain builder to be subarch-specific. Fix pfn_info structure and page reference-counting to be 64-bit clean. Signed-off-by: keir.fraser@cl.cam.ac.uk --- .rootkeys | 2 + xen/arch/x86/domain.c | 375 --------------------------- xen/arch/x86/memory.c | 42 ++-- xen/arch/x86/shadow.c | 6 +- xen/arch/x86/x86_32/domain_build.c | 389 ++++++++++++++++++++++++++++ xen/arch/x86/x86_32/mm.c | 8 +- xen/arch/x86/x86_64/domain_build.c | 391 +++++++++++++++++++++++++++++ xen/arch/x86/x86_64/mm.c | 19 +- xen/common/page_alloc.c | 8 +- xen/include/asm-x86/mm.h | 44 +++- xen/include/asm-x86/shadow.h | 2 +- 11 files changed, 863 insertions(+), 423 deletions(-) create mode 100644 xen/arch/x86/x86_32/domain_build.c create mode 100644 xen/arch/x86/x86_64/domain_build.c diff --git a/.rootkeys b/.rootkeys index 841c9cac04..d2a1df79ea 100644 --- a/.rootkeys +++ b/.rootkeys @@ -897,6 +897,7 @@ 41f97ef5139vN42cOYHfX_Ac8WOOjA xen/arch/x86/vmx_platform.c 41c0c4128URE0dxcO15JME_MuKBPfg xen/arch/x86/vmx_vmcs.c 419cbedeQDg8IrO3izo3o5rQNlo0kQ xen/arch/x86/x86_32/asm-offsets.c +4202391dkvdTZ8GhWXe3Gqf9EOgWXg xen/arch/x86/x86_32/domain_build.c 3e32af9aRnYGl4GMOaDKp7JdfhOGhg xen/arch/x86/x86_32/domain_page.c 3ddb79bcecupHj56ZbTa3B0FxDowMg xen/arch/x86/x86_32/entry.S 3ddb79bcHwuCQDjBICDTSis52hWguw xen/arch/x86/x86_32/mm.c @@ -905,6 +906,7 @@ 3ddb79bc4nTpGQOe6_-MbyZzkhlhFQ xen/arch/x86/x86_32/usercopy.c 3ddb79bcOMCu9-5mKpjIh5d0qqBDPg xen/arch/x86/x86_32/xen.lds 41bf1717Ty3hwN3E9swdu8QfnvGqww xen/arch/x86/x86_64/asm-offsets.c +4202391dA91ZovYX9d_5zJi9yGvLoQ xen/arch/x86/x86_64/domain_build.c 40e96d3aLDI-nViMuYneD7VKYlZrVg xen/arch/x86/x86_64/entry.S 41bf1717XhPz_dNT5OKSjgmbFuWBuA xen/arch/x86/x86_64/mm.c 42000d3cMb8o1WuFBXC07c8i3lPZBw xen/arch/x86/x86_64/traps.c diff --git 
a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index fe7225861b..d13d765843 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -43,20 +43,6 @@ static int opt_noreboot = 0; boolean_param("noreboot", opt_noreboot); -#if !defined(CONFIG_X86_64BITMODE) -/* No ring-3 access in initial page tables. */ -#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED) -#else -/* Allow ring-3 access in long mode as guest cannot use ring 1. */ -#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER) -#endif -#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) -#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) -#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) - -#define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK) -#define round_pgdown(_p) ((_p)&PAGE_MASK) - static void default_idle(void) { __cli(); @@ -795,364 +781,3 @@ void domain_relinquish_memory(struct domain *d) relinquish_list(d, &d->page_list); } - -int construct_dom0(struct domain *p, - unsigned long alloc_start, - unsigned long alloc_end, - char *image_start, unsigned long image_len, - char *initrd_start, unsigned long initrd_len, - char *cmdline) -{ - char *dst; - int i, rc; - unsigned long pfn, mfn; - unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT; - unsigned long nr_pt_pages; - unsigned long count; - l2_pgentry_t *l2tab, *l2start; - l1_pgentry_t *l1tab = NULL, *l1start = NULL; - struct pfn_info *page = NULL; - start_info_t *si; - struct exec_domain *ed = p->exec_domain[0]; - - /* - * This fully describes the memory layout of the initial domain. All - * *_start address are page-aligned, except v_start (and v_end) which are - * superpage-aligned. 
- */ - struct domain_setup_info dsi; - unsigned long vinitrd_start; - unsigned long vinitrd_end; - unsigned long vphysmap_start; - unsigned long vphysmap_end; - unsigned long vstartinfo_start; - unsigned long vstartinfo_end; - unsigned long vstack_start; - unsigned long vstack_end; - unsigned long vpt_start; - unsigned long vpt_end; - unsigned long v_end; - - /* Machine address of next candidate page-table page. */ - unsigned long mpt_alloc; - - extern void physdev_init_dom0(struct domain *); - - /* Sanity! */ - if ( p->id != 0 ) - BUG(); - if ( test_bit(DF_CONSTRUCTED, &p->d_flags) ) - BUG(); - - memset(&dsi, 0, sizeof(struct domain_setup_info)); - - printk("*** LOADING DOMAIN 0 ***\n"); - - /* - * This is all a bit grim. We've moved the modules to the "safe" physical - * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this - * routine we're going to copy it down into the region that's actually - * been allocated to domain 0. This is highly likely to be overlapping, so - * we use a forward copy. - * - * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with - * 4GB and lots of network/disk cards that allocate loads of buffers. - * We'll have to revisit this if we ever support PAE (64GB). - */ - - rc = parseelfimage(image_start, image_len, &dsi); - if ( rc != 0 ) - return rc; - - /* Set up domain options */ - if ( dsi.use_writable_pagetables ) - vm_assist(p, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); - - if ( (dsi.v_start & (PAGE_SIZE-1)) != 0 ) - { - printk("Initial guest OS must load to a page boundary.\n"); - return -EINVAL; - } - - /* - * Why do we need this? The number of page-table frames depends on the - * size of the bootstrap address space. But the size of the address space - * depends on the number of page-table frames (since each one is mapped - * read-only). We have a pair of simultaneous equations in two unknowns, - * which we solve by exhaustive search. 
- */ - vinitrd_start = round_pgup(dsi.v_kernend); - vinitrd_end = vinitrd_start + initrd_len; - vphysmap_start = round_pgup(vinitrd_end); - vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long)); - vpt_start = round_pgup(vphysmap_end); - for ( nr_pt_pages = 2; ; nr_pt_pages++ ) - { - vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE); - vstartinfo_start = vpt_end; - vstartinfo_end = vstartinfo_start + PAGE_SIZE; - vstack_start = vstartinfo_end; - vstack_end = vstack_start + PAGE_SIZE; - v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1); - if ( (v_end - vstack_end) < (512 << 10) ) - v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */ - if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT)-1)) >> - L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages ) - break; - } - - printk("PHYSICAL MEMORY ARRANGEMENT:\n" - " Kernel image: %p->%p\n" - " Initrd image: %p->%p\n" - " Dom0 alloc.: %08lx->%08lx\n", - image_start, image_start + image_len, - initrd_start, initrd_start + initrd_len, - alloc_start, alloc_end); - printk("VIRTUAL MEMORY ARRANGEMENT:\n" - " Loaded kernel: %08lx->%08lx\n" - " Init. ramdisk: %08lx->%08lx\n" - " Phys-Mach map: %08lx->%08lx\n" - " Page tables: %08lx->%08lx\n" - " Start info: %08lx->%08lx\n" - " Boot stack: %08lx->%08lx\n" - " TOTAL: %08lx->%08lx\n", - dsi.v_kernstart, dsi.v_kernend, - vinitrd_start, vinitrd_end, - vphysmap_start, vphysmap_end, - vpt_start, vpt_end, - vstartinfo_start, vstartinfo_end, - vstack_start, vstack_end, - dsi.v_start, v_end); - printk(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry); - - if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) ) - { - printk("Initial guest OS requires too much space\n" - "(%luMB is greater than %luMB limit)\n", - (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20); - return -ENOMEM; - } - - /* - * Protect the lowest 1GB of memory. We use a temporary mapping there - * from which we copy the kernel and ramdisk images.
- */ - if ( dsi.v_start < (1<<30) ) - { - printk("Initial loading isn't allowed to lowest 1GB of memory.\n"); - return -EINVAL; - } - - /* Paranoia: scrub DOM0's memory allocation. */ - printk("Scrubbing DOM0 RAM: "); - dst = (char *)alloc_start; - while ( dst < (char *)alloc_end ) - { -#define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */ - printk("."); - touch_nmi_watchdog(); - if ( ((char *)alloc_end - dst) > SCRUB_BYTES ) - { - memset(dst, 0, SCRUB_BYTES); - dst += SCRUB_BYTES; - } - else - { - memset(dst, 0, (char *)alloc_end - dst); - break; - } - } - printk("done.\n"); - - /* Construct a frame-allocation list for the initial domain. */ - for ( mfn = (alloc_start>>PAGE_SHIFT); - mfn < (alloc_end>>PAGE_SHIFT); - mfn++ ) - { - page = &frame_table[mfn]; - page->u.inuse.domain = p; - page->u.inuse.type_info = 0; - page->count_info = PGC_allocated | 1; - list_add_tail(&page->list, &p->page_list); - p->tot_pages++; p->max_pages++; - } - - mpt_alloc = (vpt_start - dsi.v_start) + alloc_start; - - SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES); - SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS); - - /* - * We're basically forcing default RPLs to 1, so that our "what privilege - * level are we returning to?" logic works. - */ - ed->thread.failsafe_selector = FLAT_GUESTOS_CS; - ed->thread.event_selector = FLAT_GUESTOS_CS; - ed->thread.guestos_ss = FLAT_GUESTOS_DS; - for ( i = 0; i < 256; i++ ) - ed->thread.traps[i].cs = FLAT_GUESTOS_CS; - - /* WARNING: The new domain must have its 'processor' field filled in! 
*/ - l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE; - memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE); - l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = - mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR); - l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = - mk_l2_pgentry(__pa(p->mm_perdomain_pt) | __PAGE_HYPERVISOR); - ed->mm.pagetable = mk_pagetable((unsigned long)l2start); - - l2tab += l2_table_offset(dsi.v_start); - mfn = alloc_start >> PAGE_SHIFT; - for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ ) - { - if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) ) - { - l1start = l1tab = (l1_pgentry_t *)mpt_alloc; - mpt_alloc += PAGE_SIZE; - *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT); - clear_page(l1tab); - if ( count == 0 ) - l1tab += l1_table_offset(dsi.v_start); - } - *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT); - - page = &frame_table[mfn]; - if ( !get_page_and_type(page, p, PGT_writable_page) ) - BUG(); - - mfn++; - } - - /* Pages that are part of page tables must be read only. */ - l2tab = l2start + l2_table_offset(vpt_start); - l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab); - l1tab += l1_table_offset(vpt_start); - l2tab++; - for ( count = 0; count < nr_pt_pages; count++ ) - { - *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW); - page = &frame_table[l1_pgentry_to_pagenr(*l1tab)]; - if ( count == 0 ) - { - page->u.inuse.type_info &= ~PGT_type_mask; - page->u.inuse.type_info |= PGT_l2_page_table; - - /* - * No longer writable: decrement the type_count. - * Installed as CR3: increment both the ref_count and type_count. - * Net: just increment the ref_count. - */ - get_page(page, p); /* an extra ref because of readable mapping */ - - /* Get another ref to L2 page so that it can be pinned. 
*/ - if ( !get_page_and_type(page, p, PGT_l2_page_table) ) - BUG(); - set_bit(_PGT_pinned, &page->u.inuse.type_info); - } - else - { - page->u.inuse.type_info &= ~PGT_type_mask; - page->u.inuse.type_info |= PGT_l1_page_table; - page->u.inuse.type_info |= - ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift; - - get_page(page, p); /* an extra ref because of readable mapping */ - } - l1tab++; - if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) ) - l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab++); - } - - /* Set up shared info area. */ - update_dom_time(p); - p->shared_info->domain_time = 0; - /* Mask all upcalls... */ - for ( i = 0; i < MAX_VIRT_CPUS; i++ ) - p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1; - p->shared_info->n_vcpu = smp_num_cpus; - - /* Install the new page tables. */ - __cli(); - write_ptbase(&ed->mm); - - /* Copy the OS image. */ - (void)loadelfimage(image_start); - - /* Copy the initial ramdisk. */ - if ( initrd_len != 0 ) - memcpy((void *)vinitrd_start, initrd_start, initrd_len); - - /* Set up start info area. */ - si = (start_info_t *)vstartinfo_start; - memset(si, 0, PAGE_SIZE); - si->nr_pages = p->tot_pages; - si->shared_info = virt_to_phys(p->shared_info); - si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN; - si->pt_base = vpt_start; - si->nr_pt_frames = nr_pt_pages; - si->mfn_list = vphysmap_start; - - /* Write the phys->machine and machine->phys table entries. */ - for ( pfn = 0; pfn < p->tot_pages; pfn++ ) - { - mfn = pfn + (alloc_start>>PAGE_SHIFT); -#ifndef NDEBUG -#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT) - if ( pfn > REVERSE_START ) - mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START); -#endif - ((unsigned long *)vphysmap_start)[pfn] = mfn; - machine_to_phys_mapping[mfn] = pfn; - } - - if ( initrd_len != 0 ) - { - si->mod_start = vinitrd_start; - si->mod_len = initrd_len; - printk("Initrd len 0x%lx, start at 0x%08lx\n", - si->mod_len, si->mod_start); - } - - dst = si->cmd_line; - if ( cmdline != NULL ) - { - for ( i = 0; i < 255; i++ ) - { - if ( cmdline[i] == '\0' ) - break; - *dst++ = cmdline[i]; - } - } - *dst = '\0'; - - /* Reinstate the caller's page tables. */ - write_ptbase(&current->mm); - __sti(); - - /* Destroy low mappings - they were only for our convenience.
*/ - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) - if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE ) - l2start[i] = mk_l2_pgentry(0); - zap_low_mappings(); /* Do the same for the idle page tables. */ - - /* DOM0 gets access to everything. */ - physdev_init_dom0(p); - - set_bit(DF_CONSTRUCTED, &p->d_flags); - - new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start); - -#if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */ - shadow_lock(&p->mm); - shadow_mode_enable(p, SHM_test); - shadow_unlock(&p->mm); -#endif - - return 0; -} diff --git a/xen/arch/x86/memory.c b/xen/arch/x86/memory.c index e855210973..44e1275e54 100644 --- a/xen/arch/x86/memory.c +++ b/xen/arch/x86/memory.c @@ -444,7 +444,7 @@ static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) ) return; - e = page->u.inuse.domain; + e = page_get_owner(page); if ( unlikely(e != d) ) { /* @@ -493,7 +493,7 @@ static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) static int alloc_l2_table(struct pfn_info *page) { - struct domain *d = page->u.inuse.domain; + struct domain *d = page_get_owner(page); unsigned long page_nr = page_to_pfn(page); l2_pgentry_t *pl2e; int i; @@ -512,7 +512,7 @@ static int alloc_l2_table(struct pfn_info *page) pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR); pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = - mk_l2_pgentry(__pa(page->u.inuse.domain->mm_perdomain_pt) | + mk_l2_pgentry(__pa(page_get_owner(page)->mm_perdomain_pt) | __PAGE_HYPERVISOR); #endif @@ -530,7 +530,7 @@ static int alloc_l2_table(struct pfn_info *page) static int alloc_l1_table(struct pfn_info *page) { - struct domain *d = page->u.inuse.domain; + struct domain *d = page_get_owner(page); unsigned long page_nr = page_to_pfn(page); l1_pgentry_t *pl1e; int i; @@ -570,7 +570,7 @@ static void free_l2_table(struct pfn_info *page) static void 
free_l1_table(struct pfn_info *page) { - struct domain *d = page->u.inuse.domain; + struct domain *d = page_get_owner(page); unsigned long page_nr = page - frame_table; l1_pgentry_t *pl1e; int i; @@ -731,7 +731,7 @@ int alloc_page_type(struct pfn_info *page, unsigned int type) void free_page_type(struct pfn_info *page, unsigned int type) { - struct domain *d = page->u.inuse.domain; + struct domain *d = page_get_owner(page); switch ( type ) { @@ -774,7 +774,7 @@ void put_page_type(struct pfn_info *page) * See domain.c:relinquish_list(). */ ASSERT((x & PGT_validated) || - test_bit(DF_DYING, &page->u.inuse.domain->d_flags)); + test_bit(DF_DYING, &page_get_owner(page)->d_flags)); if ( unlikely((nx & PGT_count_mask) == 0) ) { @@ -832,7 +832,7 @@ int get_page_type(struct pfn_info *page, u32 type) * may be unnecessary (e.g., page was GDT/LDT) but those * circumstances should be very rare. */ - struct domain *d = page->u.inuse.domain; + struct domain *d = page_get_owner(page); if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor], page->tlbflush_timestamp)) ) { @@ -987,7 +987,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) ) { MEM_LOG("Page %08lx bad domain (dom=%p)", - ptr, page->u.inuse.domain); + ptr, page_get_owner(page)); } else if ( likely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) ) @@ -1117,7 +1117,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) * benign reference to the page (PGC_allocated). If that reference * disappears then the deallocation routine will safely spin. 
*/ - nd = page->u.inuse.domain; + nd = page_get_owner(page); y = page->count_info; do { x = y; @@ -1173,7 +1173,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) if ( unlikely(e->tot_pages++ == 0) ) get_knownalive_domain(e); list_add_tail(&page->list, &e->page_list); - page->u.inuse.domain = e; + page_set_owner(page, e); spin_unlock(&e->page_alloc_lock); @@ -1229,7 +1229,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) * benign reference to the page (PGC_allocated). If that reference * disappears then the deallocation routine will safely spin. */ - nd = page->u.inuse.domain; + nd = page_get_owner(page); y = page->count_info; do { x = y; @@ -2072,7 +2072,7 @@ void audit_domain(struct domain *d) pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; page = &frame_table[pfn]; - if ( page->u.inuse.domain != d ) + if ( page_get_owner(page) != d ) BUG(); if ( (page->u.inuse.type_info & PGT_count_mask) > @@ -2118,7 +2118,7 @@ void audit_domain(struct domain *d) pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; page = &frame_table[pfn]; - if ( page->u.inuse.domain != d ) + if ( page_get_owner(page) != d ) BUG(); switch ( page->u.inuse.type_info & PGT_type_mask ) @@ -2144,10 +2144,10 @@ void audit_domain(struct domain *d) unsigned long l1pfn = pt[i]>>PAGE_SHIFT; struct pfn_info *l1page = &frame_table[l1pfn]; - if ( l1page->u.inuse.domain != d ) + if ( page_get_owner(l1page) != d ) { printk("L2: Skip bizarre page belonging to other " - "dom %p\n", l1page->u.inuse.domain); + "dom %p\n", page_get_owner(l1page)); continue; } @@ -2222,12 +2222,12 @@ void audit_domain(struct domain *d) } - if ( l1page->u.inuse.domain != d ) + if ( page_get_owner(l1page) != d ) { - printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx " + printk("Audit %d: [%lx,%x] Skip foreign page dom=%p " "pfn=%lx c=%08x t=%08x m2p=%lx\n", d->id, pfn, i, - (unsigned long)l1page->u.inuse.domain, + page_get_owner(l1page), l1pfn, 
l1page->count_info, l1page->u.inuse.type_info, @@ -2312,7 +2312,7 @@ void audit_domain(struct domain *d) unsigned long l1pfn = pt[i]>>PAGE_SHIFT; struct pfn_info *l1page = &frame_table[l1pfn]; - if ( l1page->u.inuse.domain == d) + if ( page_get_owner(l1page) == d ) adjust(l1page, 1, 1); } } @@ -2333,7 +2333,7 @@ void audit_domain(struct domain *d) unsigned long l1pfn = pt[i]>>PAGE_SHIFT; struct pfn_info *l1page = &frame_table[l1pfn]; - if ( (l1page->u.inuse.domain != d) || + if ( (page_get_owner(l1page) != d) || (l1pfn < 0x100) || (l1pfn > max_page) ) continue; diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c index c2fcd00779..fb1ded8945 100644 --- a/xen/arch/x86/shadow.c +++ b/xen/arch/x86/shadow.c @@ -420,7 +420,7 @@ static inline struct pfn_info *alloc_shadow_page(struct mm_struct *m) void unshadow_table(unsigned long gpfn, unsigned int type) { unsigned long spfn; - struct domain *d = frame_table[gpfn].u.inuse.domain; + struct domain *d = page_get_owner(&frame_table[gpfn]); SH_VLOG("unshadow_table type=%08x gpfn=%08lx", type, gpfn); @@ -494,7 +494,7 @@ unsigned long shadow_l2_table( spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = - mk_l2_pgentry(__pa(frame_table[gpfn].u.inuse.domain->mm_perdomain_pt) | + mk_l2_pgentry(__pa(page_get_owner(&frame_table[gpfn])->mm_perdomain_pt) | __PAGE_HYPERVISOR); } #endif @@ -924,7 +924,7 @@ int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s) if (m->shadow_mode != SHM_full_32) { if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) != - ((__pa(frame_table[gpfn].u.inuse.domain->mm.perdomain_pt) | + ((__pa(page_get_owner(&frame_table[gpfn])->mm.perdomain_pt) | __PAGE_HYPERVISOR))) ) FAILPT("hypervisor per-domain map inconsistent"); } diff --git a/xen/arch/x86/x86_32/domain_build.c b/xen/arch/x86/x86_32/domain_build.c new file mode 100644 index 0000000000..d1b30ec442 --- 
/dev/null +++ b/xen/arch/x86/x86_32/domain_build.c @@ -0,0 +1,389 @@ +/****************************************************************************** + * domain_build.c + * + * Copyright (c) 2002-2005, K A Fraser + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* No ring-3 access in initial page tables. */ +#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED) +#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) + +#define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK) +#define round_pgdown(_p) ((_p)&PAGE_MASK) + +int construct_dom0(struct domain *p, + unsigned long alloc_start, + unsigned long alloc_end, + char *image_start, unsigned long image_len, + char *initrd_start, unsigned long initrd_len, + char *cmdline) +{ + char *dst; + int i, rc; + unsigned long pfn, mfn; + unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT; + unsigned long nr_pt_pages; + unsigned long count; + l2_pgentry_t *l2tab, *l2start; + l1_pgentry_t *l1tab = NULL, *l1start = NULL; + struct pfn_info *page = NULL; + start_info_t *si; + struct exec_domain *ed = p->exec_domain[0]; + + /* + * This fully describes the memory layout of the initial domain. All + * *_start address are page-aligned, except v_start (and v_end) which are + * superpage-aligned. + */ + struct domain_setup_info dsi; + unsigned long vinitrd_start; + unsigned long vinitrd_end; + unsigned long vphysmap_start; + unsigned long vphysmap_end; + unsigned long vstartinfo_start; + unsigned long vstartinfo_end; + unsigned long vstack_start; + unsigned long vstack_end; + unsigned long vpt_start; + unsigned long vpt_end; + unsigned long v_end; + + /* Machine address of next candidate page-table page. */ + unsigned long mpt_alloc; + + extern void physdev_init_dom0(struct domain *); + + /* Sanity! 
*/ + if ( p->id != 0 ) + BUG(); + if ( test_bit(DF_CONSTRUCTED, &p->d_flags) ) + BUG(); + + memset(&dsi, 0, sizeof(struct domain_setup_info)); + + printk("*** LOADING DOMAIN 0 ***\n"); + + /* + * This is all a bit grim. We've moved the modules to the "safe" physical + * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this + * routine we're going to copy it down into the region that's actually + * been allocated to domain 0. This is highly likely to be overlapping, so + * we use a forward copy. + * + * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with + * 4GB and lots of network/disk cards that allocate loads of buffers. + * We'll have to revisit this if we ever support PAE (64GB). + */ + + rc = parseelfimage(image_start, image_len, &dsi); + if ( rc != 0 ) + return rc; + + /* Set up domain options */ + if ( dsi.use_writable_pagetables ) + vm_assist(p, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); + + if ( (dsi.v_start & (PAGE_SIZE-1)) != 0 ) + { + printk("Initial guest OS must load to a page boundary.\n"); + return -EINVAL; + } + + /* + * Why do we need this? The number of page-table frames depends on the + * size of the bootstrap address space. But the size of the address space + * depends on the number of page-table frames (since each one is mapped + * read-only). We have a pair of simultaneous equations in two unknowns, + * which we solve by exhaustive search. 
+ */ + vinitrd_start = round_pgup(dsi.v_kernend); + vinitrd_end = vinitrd_start + initrd_len; + vphysmap_start = round_pgup(vinitrd_end); + vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long)); + vpt_start = round_pgup(vphysmap_end); + for ( nr_pt_pages = 2; ; nr_pt_pages++ ) + { + vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE); + vstartinfo_start = vpt_end; + vstartinfo_end = vstartinfo_start + PAGE_SIZE; + vstack_start = vstartinfo_end; + vstack_end = vstack_start + PAGE_SIZE; + v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1); + if ( (v_end - vstack_end) < (512 << 10) ) + v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */ + if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT)-1)) >> + L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages ) + break; + } + + printk("PHYSICAL MEMORY ARRANGEMENT:\n" + " Kernel image: %p->%p\n" + " Initrd image: %p->%p\n" + " Dom0 alloc.: %08lx->%08lx\n", + image_start, image_start + image_len, + initrd_start, initrd_start + initrd_len, + alloc_start, alloc_end); + printk("VIRTUAL MEMORY ARRANGEMENT:\n" + " Loaded kernel: %08lx->%08lx\n" + " Init. ramdisk: %08lx->%08lx\n" + " Phys-Mach map: %08lx->%08lx\n" + " Page tables: %08lx->%08lx\n" + " Start info: %08lx->%08lx\n" + " Boot stack: %08lx->%08lx\n" + " TOTAL: %08lx->%08lx\n", + dsi.v_kernstart, dsi.v_kernend, + vinitrd_start, vinitrd_end, + vphysmap_start, vphysmap_end, + vpt_start, vpt_end, + vstartinfo_start, vstartinfo_end, + vstack_start, vstack_end, + dsi.v_start, v_end); + printk(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry); + + if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) ) + { + printk("Initial guest OS requires too much space\n" + "(%luMB is greater than %luMB limit)\n", + (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20); + return -ENOMEM; + } + + /* + * Protect the lowest 1GB of memory. We use a temporary mapping there + * from which we copy the kernel and ramdisk images.
+ */ + if ( dsi.v_start < (1<<30) ) + { + printk("Initial loading isn't allowed to lowest 1GB of memory.\n"); + return -EINVAL; + } + + /* Paranoia: scrub DOM0's memory allocation. */ + printk("Scrubbing DOM0 RAM: "); + dst = (char *)alloc_start; + while ( dst < (char *)alloc_end ) + { +#define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */ + printk("."); + touch_nmi_watchdog(); + if ( ((char *)alloc_end - dst) > SCRUB_BYTES ) + { + memset(dst, 0, SCRUB_BYTES); + dst += SCRUB_BYTES; + } + else + { + memset(dst, 0, (char *)alloc_end - dst); + break; + } + } + printk("done.\n"); + + /* Construct a frame-allocation list for the initial domain. */ + for ( mfn = (alloc_start>>PAGE_SHIFT); + mfn < (alloc_end>>PAGE_SHIFT); + mfn++ ) + { + page = &frame_table[mfn]; + page_set_owner(page, p); + page->u.inuse.type_info = 0; + page->count_info = PGC_allocated | 1; + list_add_tail(&page->list, &p->page_list); + p->tot_pages++; p->max_pages++; + } + + mpt_alloc = (vpt_start - dsi.v_start) + alloc_start; + + SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES); + SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS); + + /* + * We're basically forcing default RPLs to 1, so that our "what privilege + * level are we returning to?" logic works. + */ + ed->thread.failsafe_selector = FLAT_GUESTOS_CS; + ed->thread.event_selector = FLAT_GUESTOS_CS; + ed->thread.guestos_ss = FLAT_GUESTOS_DS; + for ( i = 0; i < 256; i++ ) + ed->thread.traps[i].cs = FLAT_GUESTOS_CS; + + /* WARNING: The new domain must have its 'processor' field filled in! 
*/ + l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE; + memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE); + l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR); + l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry(__pa(p->mm_perdomain_pt) | __PAGE_HYPERVISOR); + ed->mm.pagetable = mk_pagetable((unsigned long)l2start); + + l2tab += l2_table_offset(dsi.v_start); + mfn = alloc_start >> PAGE_SHIFT; + for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ ) + { + if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) ) + { + l1start = l1tab = (l1_pgentry_t *)mpt_alloc; + mpt_alloc += PAGE_SIZE; + *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT); + clear_page(l1tab); + if ( count == 0 ) + l1tab += l1_table_offset(dsi.v_start); + } + *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT); + + page = &frame_table[mfn]; + if ( !get_page_and_type(page, p, PGT_writable_page) ) + BUG(); + + mfn++; + } + + /* Pages that are part of page tables must be read only. */ + l2tab = l2start + l2_table_offset(vpt_start); + l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab); + l1tab += l1_table_offset(vpt_start); + l2tab++; + for ( count = 0; count < nr_pt_pages; count++ ) + { + *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW); + page = &frame_table[l1_pgentry_to_pagenr(*l1tab)]; + if ( count == 0 ) + { + page->u.inuse.type_info &= ~PGT_type_mask; + page->u.inuse.type_info |= PGT_l2_page_table; + + /* + * No longer writable: decrement the type_count. + * Installed as CR3: increment both the ref_count and type_count. + * Net: just increment the ref_count. + */ + get_page(page, p); /* an extra ref because of readable mapping */ + + /* Get another ref to L2 page so that it can be pinned. 
*/ + if ( !get_page_and_type(page, p, PGT_l2_page_table) ) + BUG(); + set_bit(_PGT_pinned, &page->u.inuse.type_info); + } + else + { + page->u.inuse.type_info &= ~PGT_type_mask; + page->u.inuse.type_info |= PGT_l1_page_table; + page->u.inuse.type_info |= + ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift; + + get_page(page, p); /* an extra ref because of readable mapping */ + } + l1tab++; + if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) ) + l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab++); + } + + /* Set up shared info area. */ + update_dom_time(p); + p->shared_info->domain_time = 0; + /* Mask all upcalls... */ + for ( i = 0; i < MAX_VIRT_CPUS; i++ ) + p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1; + p->shared_info->n_vcpu = smp_num_cpus; + + /* Install the new page tables. */ + __cli(); + write_ptbase(&ed->mm); + + /* Copy the OS image. */ + (void)loadelfimage(image_start); + + /* Copy the initial ramdisk. */ + if ( initrd_len != 0 ) + memcpy((void *)vinitrd_start, initrd_start, initrd_len); + + /* Set up start info area. */ + si = (start_info_t *)vstartinfo_start; + memset(si, 0, PAGE_SIZE); + si->nr_pages = p->tot_pages; + si->shared_info = virt_to_phys(p->shared_info); + si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN; + si->pt_base = vpt_start; + si->nr_pt_frames = nr_pt_pages; + si->mfn_list = vphysmap_start; + + /* Write the phys->machine and machine->phys table entries. */ + for ( pfn = 0; pfn < p->tot_pages; pfn++ ) + { + mfn = pfn + (alloc_start>>PAGE_SHIFT); +#ifndef NDEBUG +#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT) + if ( pfn > REVERSE_START ) + mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START); +#endif + ((unsigned long *)vphysmap_start)[pfn] = mfn; + machine_to_phys_mapping[mfn] = pfn; + } + + if ( initrd_len != 0 ) + { + si->mod_start = vinitrd_start; + si->mod_len = initrd_len; + printk("Initrd len 0x%lx, start at 0x%08lx\n", + si->mod_len, si->mod_start); + } + + dst = si->cmd_line; + if ( cmdline != NULL ) + { + for ( i = 0; i < 255; i++ ) + { + if ( cmdline[i] == '\0' ) + break; + *dst++ = cmdline[i]; + } + } + *dst = '\0'; + + /* Reinstate the caller's page tables. */ + write_ptbase(&current->mm); + __sti(); + + /* Destroy low mappings - they were only for our convenience.
*/ + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) + if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE ) + l2start[i] = mk_l2_pgentry(0); + zap_low_mappings(); /* Do the same for the idle page tables. */ + + /* DOM0 gets access to everything. */ + physdev_init_dom0(p); + + set_bit(DF_CONSTRUCTED, &p->d_flags); + + new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start); + +#if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */ + shadow_lock(&p->mm); + shadow_mode_enable(p, SHM_test); + shadow_unlock(&p->mm); +#endif + + return 0; +} diff --git a/xen/arch/x86/x86_32/mm.c b/xen/arch/x86/x86_32/mm.c index 5102488874..c2dee7059c 100644 --- a/xen/arch/x86/x86_32/mm.c +++ b/xen/arch/x86/x86_32/mm.c @@ -151,13 +151,13 @@ void subarch_init_memory(struct domain *dom_xen) * 64-bit operations on them. Also, just for sanity, we assert the size * of the structure here. */ - if ( (offsetof(struct pfn_info, u.inuse.domain) != + if ( (offsetof(struct pfn_info, u.inuse._domain) != (offsetof(struct pfn_info, count_info) + sizeof(u32))) || (sizeof(struct pfn_info) != 24) ) { printk("Weird pfn_info layout (%ld,%ld,%d)\n", offsetof(struct pfn_info, count_info), - offsetof(struct pfn_info, u.inuse.domain), + offsetof(struct pfn_info, u.inuse._domain), sizeof(struct pfn_info)); for ( ; ; ) ; } @@ -167,11 +167,11 @@ void subarch_init_memory(struct domain *dom_xen) idle_pg_table[l2_table_offset(RDWR_MPT_VIRT_START)]); for ( i = 0; i < 1024; i++ ) { - frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1; + frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1; /* gdt to make sure it's only mapped read-only by non-privileged domains. 
*/ frame_table[m2p_start_mfn+i].u.inuse.type_info = PGT_gdt_page | 1; - frame_table[m2p_start_mfn+i].u.inuse.domain = dom_xen; + page_set_owner(&frame_table[m2p_start_mfn+i], dom_xen); } } diff --git a/xen/arch/x86/x86_64/domain_build.c b/xen/arch/x86/x86_64/domain_build.c new file mode 100644 index 0000000000..08a423944c --- /dev/null +++ b/xen/arch/x86/x86_64/domain_build.c @@ -0,0 +1,391 @@ +/****************************************************************************** + * domain_build.c + * + * Copyright (c) 2002-2005, K A Fraser + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Allow ring-3 access in long mode as guest cannot use ring 1. */ +#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER) +#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) +#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) +#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) + +#define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK) +#define round_pgdown(_p) ((_p)&PAGE_MASK) + +int construct_dom0(struct domain *p, + unsigned long alloc_start, + unsigned long alloc_end, + char *image_start, unsigned long image_len, + char *initrd_start, unsigned long initrd_len, + char *cmdline) +{ + char *dst; + int i, rc; + unsigned long pfn, mfn; + unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT; + unsigned long nr_pt_pages; + unsigned long count; + l2_pgentry_t *l2tab, *l2start; + l1_pgentry_t *l1tab = NULL, *l1start = NULL; + struct pfn_info *page = NULL; + start_info_t *si; + struct exec_domain *ed = p->exec_domain[0]; + + /* + * This fully describes the memory layout of the initial domain. All + * *_start address are page-aligned, except v_start (and v_end) which are + * superpage-aligned. 
+ */ + struct domain_setup_info dsi; + unsigned long vinitrd_start; + unsigned long vinitrd_end; + unsigned long vphysmap_start; + unsigned long vphysmap_end; + unsigned long vstartinfo_start; + unsigned long vstartinfo_end; + unsigned long vstack_start; + unsigned long vstack_end; + unsigned long vpt_start; + unsigned long vpt_end; + unsigned long v_end; + + /* Machine address of next candidate page-table page. */ + unsigned long mpt_alloc; + + extern void physdev_init_dom0(struct domain *); + + /* Sanity! */ + if ( p->id != 0 ) + BUG(); + if ( test_bit(DF_CONSTRUCTED, &p->d_flags) ) + BUG(); + + memset(&dsi, 0, sizeof(struct domain_setup_info)); + + printk("*** LOADING DOMAIN 0 ***\n"); + + /* + * This is all a bit grim. We've moved the modules to the "safe" physical + * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this + * routine we're going to copy it down into the region that's actually + * been allocated to domain 0. This is highly likely to be overlapping, so + * we use a forward copy. + * + * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with + * 4GB and lots of network/disk cards that allocate loads of buffers. + * We'll have to revisit this if we ever support PAE (64GB). + */ + + rc = parseelfimage(image_start, image_len, &dsi); + if ( rc != 0 ) + return rc; + + /* Set up domain options */ + if ( dsi.use_writable_pagetables ) + vm_assist(p, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); + + if ( (dsi.v_start & (PAGE_SIZE-1)) != 0 ) + { + printk("Initial guest OS must load to a page boundary.\n"); + return -EINVAL; + } + + /* + * Why do we need this? The number of page-table frames depends on the + * size of the bootstrap address space. But the size of the address space + * depends on the number of page-table frames (since each one is mapped + * read-only). We have a pair of simultaneous equations in two unknowns, + * which we solve by exhaustive search. 
+ */ + vinitrd_start = round_pgup(dsi.v_kernend); + vinitrd_end = vinitrd_start + initrd_len; + vphysmap_start = round_pgup(vinitrd_end); + vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long)); + vpt_start = round_pgup(vphysmap_end); + for ( nr_pt_pages = 2; ; nr_pt_pages++ ) + { + vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE); + vstartinfo_start = vpt_end; + vstartinfo_end = vstartinfo_start + PAGE_SIZE; + vstack_start = vstartinfo_end; + vstack_end = vstack_start + PAGE_SIZE; + v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1); + if ( (v_end - vstack_end) < (512 << 10) ) + v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */ + if ( (((v_end - dsi.v_start + ((1<> + L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages ) + break; + } + + printk("PHYSICAL MEMORY ARRANGEMENT:\n" + " Kernel image: %p->%p\n" + " Initrd image: %p->%p\n" + " Dom0 alloc.: %08lx->%08lx\n", + image_start, image_start + image_len, + initrd_start, initrd_start + initrd_len, + alloc_start, alloc_end); + printk("VIRTUAL MEMORY ARRANGEMENT:\n" + " Loaded kernel: %08lx->%08lx\n" + " Init. ramdisk: %08lx->%08lx\n" + " Phys-Mach map: %08lx->%08lx\n" + " Page tables: %08lx->%08lx\n" + " Start info: %08lx->%08lx\n" + " Boot stack: %08lx->%08lx\n" + " TOTAL: %08lx->%08lx\n", + dsi.v_kernstart, dsi.v_kernend, + vinitrd_start, vinitrd_end, + vphysmap_start, vphysmap_end, + vpt_start, vpt_end, + vstartinfo_start, vstartinfo_end, + vstack_start, vstack_end, + dsi.v_start, v_end); + printk(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry); + + if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) ) + { + printk("Initial guest OS requires too much space\n" + "(%luMB is greater than %luMB limit)\n", + (v_end-dsi.v_start)>>20, (nr_pages<>20); + return -ENOMEM; + } + + /* + * Protect the lowest 1GB of memory. We use a temporary mapping there + * from which we copy the kernel and ramdisk images. 
+ */ + if ( dsi.v_start < (1<<30) ) + { + printk("Initial loading isn't allowed to lowest 1GB of memory.\n"); + return -EINVAL; + } + + /* Paranoia: scrub DOM0's memory allocation. */ + printk("Scrubbing DOM0 RAM: "); + dst = (char *)alloc_start; + while ( dst < (char *)alloc_end ) + { +#define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */ + printk("."); + touch_nmi_watchdog(); + if ( ((char *)alloc_end - dst) > SCRUB_BYTES ) + { + memset(dst, 0, SCRUB_BYTES); + dst += SCRUB_BYTES; + } + else + { + memset(dst, 0, (char *)alloc_end - dst); + break; + } + } + printk("done.\n"); + + /* Construct a frame-allocation list for the initial domain. */ + for ( mfn = (alloc_start>>PAGE_SHIFT); + mfn < (alloc_end>>PAGE_SHIFT); + mfn++ ) + { + page = &frame_table[mfn]; + page_set_owner(page, p); + page->u.inuse.type_info = 0; + page->count_info = PGC_allocated | 1; + list_add_tail(&page->list, &p->page_list); + p->tot_pages++; p->max_pages++; + } + + mpt_alloc = (vpt_start - dsi.v_start) + alloc_start; + + SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES); + SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS); + + /* + * We're basically forcing default RPLs to 1, so that our "what privilege + * level are we returning to?" logic works. + */ + ed->thread.failsafe_selector = FLAT_GUESTOS_CS; + ed->thread.event_selector = FLAT_GUESTOS_CS; + ed->thread.guestos_ss = FLAT_GUESTOS_DS; + for ( i = 0; i < 256; i++ ) + ed->thread.traps[i].cs = FLAT_GUESTOS_CS; + + /* WARNING: The new domain must have its 'processor' field filled in! 
*/ + l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE; + memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE); + l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR); + l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry(__pa(p->mm_perdomain_pt) | __PAGE_HYPERVISOR); + ed->mm.pagetable = mk_pagetable((unsigned long)l2start); + + l2tab += l2_table_offset(dsi.v_start); + mfn = alloc_start >> PAGE_SHIFT; + for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ ) + { + if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) ) + { + l1start = l1tab = (l1_pgentry_t *)mpt_alloc; + mpt_alloc += PAGE_SIZE; + *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT); + clear_page(l1tab); + if ( count == 0 ) + l1tab += l1_table_offset(dsi.v_start); + } + *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT); + + page = &frame_table[mfn]; + if ( !get_page_and_type(page, p, PGT_writable_page) ) + BUG(); + + mfn++; + } + + /* Pages that are part of page tables must be read only. */ + l2tab = l2start + l2_table_offset(vpt_start); + l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab); + l1tab += l1_table_offset(vpt_start); + l2tab++; + for ( count = 0; count < nr_pt_pages; count++ ) + { + *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW); + page = &frame_table[l1_pgentry_to_pagenr(*l1tab)]; + if ( count == 0 ) + { + page->u.inuse.type_info &= ~PGT_type_mask; + page->u.inuse.type_info |= PGT_l2_page_table; + + /* + * No longer writable: decrement the type_count. + * Installed as CR3: increment both the ref_count and type_count. + * Net: just increment the ref_count. + */ + get_page(page, p); /* an extra ref because of readable mapping */ + + /* Get another ref to L2 page so that it can be pinned. 
*/ + if ( !get_page_and_type(page, p, PGT_l2_page_table) ) + BUG(); + set_bit(_PGT_pinned, &page->u.inuse.type_info); + } + else + { + page->u.inuse.type_info &= ~PGT_type_mask; + page->u.inuse.type_info |= PGT_l1_page_table; + page->u.inuse.type_info |= + ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<shared_info->domain_time = 0; + /* Mask all upcalls... */ + for ( i = 0; i < MAX_VIRT_CPUS; i++ ) + p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1; + p->shared_info->n_vcpu = smp_num_cpus; + + /* Install the new page tables. */ + __cli(); + write_ptbase(&ed->mm); + + /* Copy the OS image. */ + (void)loadelfimage(image_start); + + /* Copy the initial ramdisk. */ + if ( initrd_len != 0 ) + memcpy((void *)vinitrd_start, initrd_start, initrd_len); + + /* Set up start info area. */ + si = (start_info_t *)vstartinfo_start; + memset(si, 0, PAGE_SIZE); + si->nr_pages = p->tot_pages; + si->shared_info = virt_to_phys(p->shared_info); + si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN; + si->pt_base = vpt_start; + si->nr_pt_frames = nr_pt_pages; + si->mfn_list = vphysmap_start; + + /* Write the phys->machine and machine->phys table entries. */ + for ( pfn = 0; pfn < p->tot_pages; pfn++ ) + { + mfn = pfn + (alloc_start>>PAGE_SHIFT); +#ifndef NDEBUG +#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT) + if ( pfn > REVERSE_START ) + mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START); +#endif + ((unsigned long *)vphysmap_start)[pfn] = mfn; + machine_to_phys_mapping[mfn] = pfn; + } + + if ( initrd_len != 0 ) + { + si->mod_start = vinitrd_start; + si->mod_len = initrd_len; + printk("Initrd len 0x%lx, start at 0x%08lx\n", + si->mod_len, si->mod_start); + } + + dst = si->cmd_line; + if ( cmdline != NULL ) + { + for ( i = 0; i < 255; i++ ) + { + if ( cmdline[i] == '\0' ) + break; + *dst++ = cmdline[i]; + } + } + *dst = '\0'; + + /* Reinstate the caller's page tables. */ + write_ptbase(¤t->mm); + __sti(); + + /* Destroy low mappings - they were only for our convenience. 
*/ + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) + if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE ) + l2start[i] = mk_l2_pgentry(0); + zap_low_mappings(); /* Do the same for the idle page tables. */ + + /* DOM0 gets access to everything. */ + physdev_init_dom0(p); + + set_bit(DF_CONSTRUCTED, &p->d_flags); + + new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start); + +#if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */ + shadow_lock(&p->mm); + shadow_mode_enable(p, SHM_test); + shadow_unlock(&p->mm); +#endif + + return 0; +} diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index e57208009f..7d94cbc896 100644 --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -171,6 +171,21 @@ void subarch_init_memory(struct domain *dom_xen) l3_pgentry_t l3e; l2_pgentry_t l2e; + /* + * We are rather picky about the layout of 'struct pfn_info'. The + * count_info and domain fields must be adjacent, as we perform atomic + * 64-bit operations on them. + */ + if ( (offsetof(struct pfn_info, u.inuse._domain) != + (offsetof(struct pfn_info, count_info) + sizeof(u32))) ) + { + printk("Weird pfn_info layout (%ld,%ld,%d)\n", + offsetof(struct pfn_info, count_info), + offsetof(struct pfn_info, u.inuse._domain), + sizeof(struct pfn_info)); + for ( ; ; ) ; + } + /* M2P table is mappable read-only by privileged domains. */ for ( v = RDWR_MPT_VIRT_START; v != RDWR_MPT_VIRT_END; @@ -187,11 +202,11 @@ void subarch_init_memory(struct domain *dom_xen) for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) { - frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1; + frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1; /* gdt to make sure it's only mapped read-only by non-privileged domains. 
*/ frame_table[m2p_start_mfn+i].u.inuse.type_info = PGT_gdt_page | 1; - frame_table[m2p_start_mfn+i].u.inuse.domain = dom_xen; + page_set_owner(&frame_table[m2p_start_mfn+i], dom_xen); } } } diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c index eae1692eb1..f57ddbe17d 100644 --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -418,7 +418,7 @@ unsigned long alloc_xenheap_pages(unsigned int order) for ( i = 0; i < (1 << order); i++ ) { pg[i].count_info = 0; - pg[i].u.inuse.domain = NULL; + pg[i].u.inuse._domain = 0; pg[i].u.inuse.type_info = 0; } @@ -501,7 +501,7 @@ struct pfn_info *alloc_domheap_pages(struct domain *d, unsigned int order) } pg[i].count_info = 0; - pg[i].u.inuse.domain = NULL; + pg[i].u.inuse._domain = 0; pg[i].u.inuse.type_info = 0; } @@ -529,7 +529,7 @@ struct pfn_info *alloc_domheap_pages(struct domain *d, unsigned int order) for ( i = 0; i < (1 << order); i++ ) { - pg[i].u.inuse.domain = d; + page_set_owner(&pg[i], d); wmb(); /* Domain pointer must be visible before updating refcnt. */ pg[i].count_info |= PGC_allocated | 1; list_add_tail(&pg[i].list, &d->page_list); @@ -544,7 +544,7 @@ struct pfn_info *alloc_domheap_pages(struct domain *d, unsigned int order) void free_domheap_pages(struct pfn_info *pg, unsigned int order) { int i, drop_dom_ref; - struct domain *d = pg->u.inuse.domain; + struct domain *d = page_get_owner(pg); struct exec_domain *ed; void *p; int cpu_mask = 0; diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 9bea940db3..a10fa83ae9 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -30,6 +30,9 @@ struct pfn_info /* Each frame can be threaded onto a doubly-linked list. */ struct list_head list; + /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */ + u32 tlbflush_timestamp; + /* Reference count and various PGC_xxx flags and fields. */ u32 count_info; @@ -39,24 +42,22 @@ struct pfn_info /* Page is in use: ((count_info & PGC_count_mask) != 0). 
*/ struct { /* Owner of this page (NULL if page is anonymous). */ - struct domain *domain; + u32 _domain; /* pickled format */ /* Type reference count and various PGT_xxx flags and fields. */ u32 type_info; - } inuse; + } PACKED inuse; /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */ struct { /* Mask of possibly-tainted TLBs. */ - unsigned long cpu_mask; + u32 cpu_mask; /* Order-size of the free chunk this page is the head of. */ u8 order; - } free; + } PACKED free; - } u; + } PACKED u; - /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */ - u32 tlbflush_timestamp; -}; +} PACKED; /* The following page types are MUTUALLY EXCLUSIVE. */ #define PGT_none (0<<29) /* no special uses of this page */ @@ -97,9 +98,25 @@ struct pfn_info #define IS_XEN_HEAP_FRAME(_pfn) (page_to_phys(_pfn) < xenheap_phys_end) +#if defined(__i386__) + +#define pickle_domptr(_d) ((u32)(unsigned long)(_d)) +#define unpickle_domptr(_d) ((struct domain *)(unsigned long)(_d)) + +#elif defined(__x86_64__) +static inline struct domain *unpickle_domptr(u32 _domain) +{ return (_domain == 0) ? NULL : __va(_domain); } +static inline u32 pickle_domptr(struct domain *domain) +{ return (domain == NULL) ? 0 : (u32)__pa(domain); } + +#endif + +#define page_get_owner(_p) (unpickle_domptr((_p)->u.inuse._domain)) +#define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d)) + #define SHARE_PFN_WITH_DOMAIN(_pfn, _dom) \ do { \ - (_pfn)->u.inuse.domain = (_dom); \ + page_set_owner((_pfn), (_dom)); \ /* The incremented type count is intended to pin to 'writable'. */ \ (_pfn)->u.inuse.type_info = PGT_writable_page | PGT_validated | 1; \ wmb(); /* install valid domain ptr before updating refcnt. 
*/ \ @@ -142,7 +159,8 @@ static inline int get_page(struct pfn_info *page, struct domain *domain) { u32 x, nx, y = page->count_info; - struct domain *d, *nd = page->u.inuse.domain; + u32 d, nd = page->u.inuse._domain; + u32 _domain = pickle_domptr(domain); do { x = y; @@ -150,10 +168,10 @@ static inline int get_page(struct pfn_info *page, d = nd; if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */ unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */ - unlikely(d != domain) ) /* Wrong owner? */ + unlikely(d != _domain) ) /* Wrong owner? */ { DPRINTK("Error pfn %08lx: ed=%p, sd=%p, caf=%08x, taf=%08x\n", - page_to_pfn(page), domain, d, + page_to_pfn(page), domain, unpickle_domptr(d), x, page->u.inuse.type_info); return 0; } @@ -198,7 +216,7 @@ static inline int get_page_and_type(struct pfn_info *page, ASSERT(((_p)->u.inuse.type_info & PGT_count_mask) != 0) #define ASSERT_PAGE_IS_DOMAIN(_p, _d) \ ASSERT(((_p)->count_info & PGC_count_mask) != 0); \ - ASSERT((_p)->u.inuse.domain == (_d)) + ASSERT(page_get_owner(_p) == (_d)) int check_descriptor(unsigned long *d); diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h index 9705e9ebfa..b43b11e583 100644 --- a/xen/include/asm-x86/shadow.h +++ b/xen/include/asm-x86/shadow.h @@ -189,7 +189,7 @@ static inline int __mark_dirty( struct mm_struct *m, unsigned int mfn) SH_LOG("mark_dirty OOR! mfn=%x pfn=%lx max=%x (mm %p)", mfn, pfn, m->shadow_dirty_bitmap_size, m ); SH_LOG("dom=%p caf=%08x taf=%08x\n", - frame_table[mfn].u.inuse.domain, + page_get_owner(&frame_table[mfn]), frame_table[mfn].count_info, frame_table[mfn].u.inuse.type_info ); } -- 2.30.2